import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
# NOTE: importing scipy in this environment emitted a NumPy compatibility
# UserWarning (this SciPy build expects NumPy >=1.16.5 and <1.23.0; 1.24.2 was
# detected). Environment issue only — not part of the analysis code.
# Load the transaction dataset and run basic data-quality checks.
df = pd.read_csv("/Users/abelabykuriakose/downloads/transaction_anomalies_dataset.csv")

# Structure overview (column names, dtypes, non-null counts).
df.info()

# Column names for reference.
df.columns

# Duplicate-row count — expected 0.
df.duplicated().sum()

# Missing-value count per column — expected all 0.
df.isnull().sum()

# Preview the full frame (notebook-style display).
df
# Exploratory visualizations of the transaction data.

# Distribution of individual transaction amounts.
hist_fig = px.histogram(
    df,
    x='Transaction_Amount',
    nbins=30,
    title='Distribution of Transaction Amount',
)
hist_fig.show()

# Spread of transaction amounts per account type.
box_fig = px.box(
    df,
    x='Account_Type',
    y='Transaction_Amount',
    title='Transaction Amount by Account Type',
)
box_fig.show()

# Average transaction amount against customer age, colored by account type.
scatter_fig = px.scatter(
    df,
    x='Age',
    y='Average_Transaction_Amount',
    color='Account_Type',
    title='Average Transaction Amount vs. Age',
)
scatter_fig.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric columns.
# numeric_only=True makes the column selection explicit (object columns such as
# Transaction_ID / Day_of_Week are excluded) and silences the pandas
# FutureWarning raised by the deprecated implicit default.
corr = df.corr(numeric_only=True)
fig_corr_heatmap = sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()
# Rule-based anomaly labeling: flag transactions whose amount is more than
# two standard deviations above the mean.
mean_amount = df['Transaction_Amount'].mean()
std_amount = df['Transaction_Amount'].std()
print(mean_amount)
print(std_amount)

# Threshold = mean + 2*std (roughly the 97.7th percentile under normality).
anomaly_threshold = mean_amount + 2 * std_amount
anomaly_threshold

# Boolean label: True where the transaction amount exceeds the threshold.
df['Is_Anomaly'] = df['Transaction_Amount'] > anomaly_threshold
# Visualize flagged transactions (red) against normal ones (green).
color_map = {True: 'red', False: 'Green'}

# Also keep a per-row color label on the frame itself.
df['Color'] = df['Is_Anomaly'].map(color_map)

anomaly_fig = px.scatter(
    df,
    x='Transaction_Amount',
    y='Average_Transaction_Amount',
    color='Is_Anomaly',
    color_discrete_map=color_map,
    title='Anomalies in Transaction Amount',
)

# Enlarge the markers for better visibility.
anomaly_fig.update_traces(marker=dict(size=12),
                          selector=dict(mode='markers', marker_size=1))

# Show the plot.
anomaly_fig.show()
# Count the flagged anomalies.
num_anomalies = df['Is_Anomaly'].sum()
num_anomalies

total_instances = df.shape[0]
# Calculate the ratio of anomalies in the dataset.
anomaly_ratio = num_anomalies / total_instances
print(anomaly_ratio)
# Numeric features used to train the anomaly detector.
relevant_features = ['Transaction_Amount',
                     'Average_Transaction_Amount',
                     'Frequency_of_Transactions']

# Split data into features (X) and target variable (y).
X = df[relevant_features]
y = df['Is_Anomaly']

# Hold out 20% of rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Isolation Forest model. contamination=0.02 matches the observed
# anomaly ratio (20 anomalies out of 1000 rows) computed from the rule-based labels.
model = IsolationForest(contamination=0.02, random_state=42)
model.fit(X_train)
# Predict on the held-out set. IsolationForest.predict returns -1 for
# anomalies and 1 for normal points.
y_pred = model.predict(X_test)

# Convert predictions to binary values (0: normal, 1: anomaly) so they are
# directly comparable with the boolean Is_Anomaly labels.
y_pred_binary = [1 if pred == -1 else 0 for pred in y_pred]

# Evaluate the model's performance against the rule-based labels.
report = classification_report(y_test, y_pred_binary, target_names=['Normal', 'Anomaly'])
print(report)
# Relevant features used during training.
relevant_features = ['Transaction_Amount', 'Average_Transaction_Amount', 'Frequency_of_Transactions']

# Get user inputs, one value per feature.
# NOTE(review): float(input(...)) raises ValueError on non-numeric input — no
# retry handling here, matching the original behavior.
user_inputs = []
for feature in relevant_features:
    user_input = float(input(f"Enter the value for '{feature}': "))
    user_inputs.append(user_input)

# Create a single-row DataFrame so the column names match the training data.
user_df = pd.DataFrame([user_inputs], columns=relevant_features)

# Predict anomalies using the model (-1: anomaly, 1: normal).
user_anomaly_pred = model.predict(user_df)

# Index the single prediction explicitly instead of relying on the implicit
# truth value of a one-element NumPy array (deprecated in recent NumPy).
user_anomaly_pred_binary = 1 if user_anomaly_pred[0] == -1 else 0

if user_anomaly_pred_binary == 1:
    print("Anomaly detected: This transaction is flagged as an anomaly.")
else:
    print("No anomaly detected: This transaction is normal.")